
rm(list=ls())
library(foreign)

set.seed(21042020)

###
### IRT estimator
###
source("IRT_mixing/mixture_irt.R")

###
### load all large survey data
###

#load("bigsurveys_recoded4.RData")
data <- read.dta("bigsurveys_recoded4.dta")

policy_ind <- 3:329
years <- unique(data$source)

table(years)

###
### Remove modules from datasets
###

for(year in years){
    tmp <- data[,policy_ind]
    tmp <- tmp[data$source==year,]
    allna <- is.na(colMeans(tmp,na.rm=T))
    tmp <- tmp[,! allna]
    
    # we need fewer responses than this
    # to determine that an item was asked
    # on a module
    threshold <- 5000
    
    module_items <- c()
    for(i in 1:dim(tmp)[2]){
      vec <- ifelse(tmp[,i]==0,1, tmp[,i])
      if(sum(vec,na.rm=T) < threshold){
        module_items <- c(module_items,i)
      }
    }
    if(length(module_items) > 1){
       module_people <- which(! is.na(rowMeans(tmp[,module_items],na.rm=T)))
    } else {module_people <- c()}
    
    
    print(year)
    print(c("Total people, items:",dim(tmp))) 
    print(c("Module people:",length(module_people))) 
    print(c("Module items:",length(module_items)))
    
    # now that module items have been identified,
    # NA those items for those modules
    if(length(module_items) > 0){
       ind <- which(names(data) %in% names(tmp)[module_items])
       data[data$source==year,ind] <- NA
    }
}


###
### run em_mix_irt on every included data set, and append estimates
###

# store all of the results in a list
reslist <- list()
# store ideal points
data$x <- rep(NA,dim(data)[1])
# store probability of each voter type
data$w1 <- rep(NA,dim(data)[1])
data$w2 <- rep(NA,dim(data)[1])
data$w3 <- rep(NA,dim(data)[1])
# store individual log likelihoods
data$lk1 <- rep(NA,dim(data)[1])
data$lk2 <- rep(NA,dim(data)[1])
for(i in 1:length(years)){
   year <- years[i]

   print(year)
   mat <- apply(data[data$source==year,policy_ind],2,as.numeric)
   exclude <- ! colnames(mat) %in% c("cces2012_immigration5","cces2012_immigration6")
   notna <- which(! is.na(colMeans(mat,na.rm=T)) & exclude)
   print(length(notna))
   
   mat <- mat[, notna]
   res <- em_mix_irt(mat, iter=30)
   reslist[[year]] <- res
   data$x[data$source==year] <- res$irt$x
   data$w1[data$source==year] <- res$w[,1]
   data$w2[data$source==year] <- res$w[,2]
   data$w3[data$source==year] <- res$w[,3]
   data$lk1[data$source==year] <- res$irt$lk
   data$lk2[data$source==year] <- res$ivp$lk
}

###
### Identify the direction using pid7
###

data$pid7[data$pid7=="__NA__"] <- NA
data$pid7 <- as.numeric(data$pid7)
sources <- unique(data$source)

for(i in sources){
      ind <- data$source==i
      direction <- sign(cor(data$pid7[ind],data$x[ind],use="pairwise.complete.obs"))
      data$x[ind] <- direction*data$x[ind]
}

for(i in sources){
      ind <- data$source==i
      print(i)
      print(cor(data$pid7[ind],data$x[ind],use="pairwise.complete.obs"))
}

###
### Save the results
###

save(reslist,file="all_results4.RData")

save(data,file="bigsurveys_recoded_plus_estimates4.RData")

write.dta(data,file="bigsurveys_recoded4.dta")


###
### Evaluate dispersion in the discrimination parameters
###

for(i in 1:length(reslist)){
    year <- years[i]
    print(years[i])

    mat <- apply(data[data$source==year,policy_ind],2,as.numeric)
    exclude <- ! colnames(mat) %in% c("cces2012_immigration5","cces2012_immigration6")
    notna <- which(! is.na(colMeans(mat,na.rm=T)) & exclude)
    print(paste("items:",length(notna)))   
    mat <- mat[, notna]
    
    print(paste("max discrimination:",round(max(abs(reslist[[i]]$irt$b)),3)))
    ind <- which(abs(reslist[[i]]$irt$b) > 1/2*max(abs(reslist[[i]]$irt$b)))
    print(paste("items with > half discrimination of max:",length(ind)))
    print(colnames(mat)[ind])
    ind <- which(abs(reslist[[i]]$irt$b) > 1/3*max(abs(reslist[[i]]$irt$b)))
    print(paste("items with > 1/3 discrimination of max:",length(ind)))
    print(colnames(mat)[ind])
    print("--------------")
}


###
###  
###

for(i in 1:length(reslist)){
    year <- years[i]
    print(years[i])
    
    sdx <- sd(reslist[[i]]$irt$x)
    meanx <- mean(reslist[[i]]$irt$x)
    reslist[[i]]$irt$a <- reslist[[i]]$irt$a +
           reslist[[i]]$irt$b*meanx
    reslist[[i]]$irt$b <- reslist[[i]]$irt$b*sdx
reslist[[i]]$irt$x <- (reslist[[i]]$irt$x-meanx)/sdx
}

###
### Graphs of the item characteristic curves
###

pdf("item_curves.pdf")
for(i in 1:length(reslist)){
    year <- years[i]
    print(years[i])

    a <- reslist[[i]]$irt$a[1]
    b <- reslist[[i]]$irt$b[1]
    curve(pnorm(a+b*x),from=-3,to=3,
	  lwd=.5,axes=F,xlab="Ideal Point",
	  ylab="",main=year,ylim=c(0,1))
    axis(1)
    rug(reslist[[i]]$irt$x)
    for(j in 2:length(reslist[[i]]$irt$a)){
       a <- reslist[[i]]$irt$a[j]
       b <- reslist[[i]]$irt$b[j]
       curve(pnorm(a+b*x),from=-3,to=3,add=T,
	     lwd=.5)
    }

print(summary(reslist[[i]]$irt$x))
}
dev.off()



